In [1]:
#Data Handling
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale
#Visualization
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import display
import seaborn as sns
import plotly.offline as plotly
plotly.offline.init_notebook_mode()
from plotly.graph_objs import *
#Feature extraction
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage
from sklearn.cluster.bicluster import SpectralBiclustering, SpectralCoclustering
#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import NMF
#Misc
from scipy.signal import argrelextrema
from math import floor
from os import listdir
In [2]:
metadata = pd.read_csv("sample_sheet_Patrick_AdiposeMacs.txt", sep="\t")
metadata["#id"] = metadata["#id"].astype(str)
metadata
Out[2]:
#id flocell series lane il_barcode cel_barcode project
0 2 CALW2ANXX LFD L001 pool 2 Patrick_AdiposeMacs
1 6 CALW2ANXX HFD L001 pool 3 Patrick_AdiposeMacs
2 7 CALW2ANXX HFD L001 pool 4 Patrick_AdiposeMacs
3 11 CALW2ANXX HFD-SEA L001 pool 5 Patrick_AdiposeMacs
4 12 CALW2ANXX HFD-SEA L001 pool 6 Patrick_AdiposeMacs
5 13 CALW2ANXX HFD-SEA L001 pool 7 Patrick_AdiposeMacs
6 14 CALW2ANXX HFD-SEA L001 pool 8 Patrick_AdiposeMacs
7 16 CALW2ANXX HFD-pLex L001 pool 9 Patrick_AdiposeMacs
8 17 CALW2ANXX HFD-pLex L001 pool 10 Patrick_AdiposeMacs
9 18 CALW2ANXX HFD-pLex L001 pool 11 Patrick_AdiposeMacs
10 20 CALW2ANXX HFD-pLex L001 pool 12 Patrick_AdiposeMacs
In [3]:
#Import data from files
study_data = None
for fname in listdir("htseq_count/"):
    exp_id_long = fname.split("_")[4].split(".")[0]
    exp_id = exp_id_long.lstrip("0")
    exp_data = pd.read_csv("htseq_count/"+fname, sep="\t", header=None, names=["gene", str(exp_id)])
    if study_data is None:
        study_data = exp_data
    else:      
        study_data = study_data.merge(exp_data, on="gene", how="outer")
#Clean up        
study_data.index = study_data["gene"].values
del study_data["gene"]
study_data = study_data.T
study_data = study_data.merge(metadata, left_index=True, right_on="#id")
study_data.index = study_data.apply(lambda x: x["#id"] + " " + x["series"], axis=1)
sample_readinfo = study_data.iloc[:,-12:-1]
study_data = study_data.iloc[:,:-12]
#Display dfs
display(study_data.head())
display(sample_readinfo.head())
#Sanity check
if study_data.isnull().sum().sum() != 0:
    print("Some genes missing from count file")
0610005C13Rik 0610007P14Rik 0610009B22Rik 0610009L18Rik 0610009O20Rik 0610010B08Rik 0610010F05Rik 0610010K14Rik 0610011F06Rik 0610012G03Rik ... Zxda Zxdb Zxdc Zyg11a Zyg11b Zyx Zzef1 Zzz3 a l7Rn6
16 HFD-pLex 0 28 14 4 16 0 13 127 7 243 ... 6 10 15 0 22 242 8 15 0 30
17 HFD-pLex 0 81 23 17 62 0 64 310 23 598 ... 18 19 48 0 68 649 38 38 0 139
18 HFD-pLex 0 24 11 5 20 0 10 92 10 170 ... 6 2 15 0 14 188 4 8 0 31
20 HFD-pLex 0 17 11 3 16 0 12 66 0 131 ... 5 3 10 0 13 166 23 5 0 26
11 HFD-SEA 0 63 26 3 40 0 28 213 27 400 ... 11 7 39 0 24 388 32 16 0 106

5 rows × 24421 columns

no_feature ambiguous too_low_aQual not_aligned alignment_not_unique #id flocell series lane il_barcode cel_barcode
16 HFD-pLex 389086 8790 3773792 962931 0 16 CALW2ANXX HFD-pLex L001 pool 9
17 HFD-pLex 1161915 26330 9982554 2104229 0 17 CALW2ANXX HFD-pLex L001 pool 10
18 HFD-pLex 254857 6185 2750164 612610 0 18 CALW2ANXX HFD-pLex L001 pool 11
20 HFD-pLex 293660 7165 2463895 619809 0 20 CALW2ANXX HFD-pLex L001 pool 12
11 HFD-SEA 709937 17129 6558013 1738422 0 11 CALW2ANXX HFD-SEA L001 pool 5
In [4]:
count_matrix = minmax_scale(study_data.as_matrix(), axis=0)
heatmap = Data([
    Heatmap(
        x = study_data.columns, 
        y = study_data.index,
        z = count_matrix,
        colorscale='Viridis'
    )
])
layout = Layout(
    title='Gene Expression Heatmap',
    yaxis=dict(type="category"),
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
C:\Users\jpole\Anaconda3\lib\site-packages\sklearn\utils\validation.py:429: DataConversionWarning:

Data with input dtype int64 was converted to float64.

In [5]:
p = sns.barplot(study_data.index, study_data.sum(axis=1).as_matrix())
p.set_xticklabels(study_data.index, rotation=90)
plt.title("Gene Count Intensity Per Experiment")
plt.show()
In [6]:
norm_count_matrix = study_data.as_matrix() / study_data.sum(axis=1)[:, None]
norm_count_matrix = norm_count_matrix[:, norm_count_matrix.sum(axis=0) > 0]
norm_count_matrix = minmax_scale(norm_count_matrix)
print(count_matrix.shape, norm_count_matrix.shape)
heatmap = Data([
    Heatmap(
        x = study_data.columns, 
        y = study_data.index,
        z = norm_count_matrix,
        colorscale='Viridis'
    )
])
layout = Layout(
    title='Gene Expression Heatmap Normalized By Per Experiment Intensity',
    yaxis=dict(type="category"),
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
(11, 24421) (11, 15665)
In [7]:
def plotClusters(comp1, comp2, comp3, treatments, clusters, title=""):
    visrep = Scatter3d(
        x=comp1,
        y=comp2,
        z=comp3,
        text=treatments,
        mode='markers',
        marker=dict(
            size=12,
            color=clusters,
            colorscale='Viridis',
            opacity=0.8,
            line = dict(
                width = 0,
                color = "black"
            )
        )
    )
    fig = Figure(data=[visrep], layout=Layout(title=title))
    plotly.iplot(fig)
In [8]:
#PCA Decomposition to support later visualization
dim_red = PCA(n_components=10)
dim_red_genes = dim_red.fit_transform(norm_count_matrix)
print("% Variance Explained:", dim_red.explained_variance_ratio_.sum()*100)
plotClusters(dim_red_genes.T[0], dim_red_genes.T[1], dim_red_genes.T[2], 
             study_data.index, sample_readinfo["series"].astype("category").cat.rename_categories([0,1,2,3]), 
             "Experimental Groups on 3 PCs")
% Variance Explained: 100.0
In [9]:
biclust = SpectralBiclustering(n_clusters=4)
biclust.fit(norm_count_matrix)
fit_data = np.asarray(norm_count_matrix)[np.argsort(biclust.row_labels_)]
fit_data = fit_data[:, np.argsort(biclust.column_labels_)]

heatmap = Data([
    Heatmap(
        y = study_data.index[np.argsort(biclust.row_labels_)], 
        x = study_data.columns[np.argsort(biclust.column_labels_)],
        z = fit_data,    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='Spectral CoClustering of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [10]:
#NMF Decomposition
nmf_decomp = NMF(n_components=4)
nmf_genes = nmf_decomp.fit_transform(norm_count_matrix, sample_readinfo["series"])
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_genes),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Decomposition of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [52]:
def nmf_cluster(nmf_data, threshold):
    nmf_data = pd.DataFrame(nmf_data)
    return nmf_data.apply(lambda x: x > threshold, axis=0);
step_size = 1.75;
min_val=0
max_val=10
tuning_range = np.arange(min_val, max_val+step_size, step_size);
order = .25*(max_val-min_val)/step_size
tuning_data = [(nmf_cluster(nmf_genes, i).sum(axis=1) == 1).sum()/nmf_genes.shape[0] for i in tuning_range]
plt.plot(tuning_range, tuning_data)
lextrema = argrelextrema(np.asarray(tuning_data), np.greater, order=floor(order))[0]
for lmin in tuning_range[lextrema]:
    plt.axvline(x=lmin, color="red", lw=.5)
plt.show()
In [53]:
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_cluster(nmf_genes, tuning_range[lextrema[0]])).astype(int),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Soft Clustering Assignments of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [54]:
#NMF Decomposition
nmf_decomp = NMF(n_components=5)
nmf_genes = nmf_decomp.fit_transform(norm_count_matrix, sample_readinfo["series"])
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_genes),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Decomposition of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
def nmf_cluster(nmf_data, threshold):
    nmf_data = pd.DataFrame(nmf_data)
    return nmf_data.apply(lambda x: x > threshold, axis=0);
step_size = 1.75;
min_val=0
max_val=10
tuning_range = np.arange(min_val, max_val+step_size, step_size);
order = .25*(max_val-min_val)/step_size
tuning_data = [(nmf_cluster(nmf_genes, i).sum(axis=1) == 1).sum()/nmf_genes.shape[0] for i in tuning_range]
plt.plot(tuning_range, tuning_data)
lextrema = argrelextrema(np.asarray(tuning_data), np.greater, order=floor(order))[0]
for lmin in tuning_range[lextrema]:
    plt.axvline(x=lmin, color="red", lw=.5)
plt.show()
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_cluster(nmf_genes, tuning_range[lextrema[0]])).astype(int),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Soft Clustering Assignments of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [39]:
sns.distplot(np.std(norm_count_matrix, axis=0))
#Limit features
gene_filter = np.std(norm_count_matrix, axis=0) > .3
limited_study_data = study_data.loc[:, gene_filter]
limited_count_matrix = norm_count_matrix[:, gene_filter]
print(count_matrix.shape, limited_count_matrix.shape)
(11, 24421) (11, 7248)
In [13]:
heatmap = Data([
    Heatmap(
        y = limited_study_data.columns.values, 
        x = limited_study_data.index,
        z = limited_count_matrix.T,
        colorscale='Viridis'
    )
])
layout = Layout(
    title='Limited Gene Expression Heatmap',
    yaxis=dict(type="category")
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [ ]: